During the quality control (QC) steps, we have accumulated different doublet annotations:
Although we consider cell hashing as ground-truth, it has some limitations. First, it cannot detect intra-batch doublets (doublets that share the same hashtag). Second, we observed a marked variability in hashing efficiency across libraries (measured with the signal-to-noise ratio), which suggests that for some libraries the detection was not perfect. Finally, as we wanted to measure the effect of hashing on transcriptional profiles, we included several non-hashed libraries, for which we have no experimental doublet annotation.
To overcome these issues, we ran scrublet, which predicts doublets computationally. Although it cannot find homotypic doublets (composed of cells that share the same transcriptional state), it will help us to accumulate independent sources of evidence that we can visualize downstream.
Of particular note, both cell hashing and scrublet were run for each library independently. Thus, we aimed to combined both approaches in a single metric that can consider all cells in the dataset, hence increasing the statistical power to detect doublets. Here, we calculate the proportion of doublet nearest neighbors (pDNN) using the KNN graph computed in the previous notebook. This metric is inspired in the proportion of artificial nearest neighbors (pANN) described in the DoubletFinder article. Following this principle, we expect to obtain a bimodal pDNN distribution that allows us to easily separate singlets and homotypic doublets from heterotypic doublets.
library(Seurat)
library(Matrix)
library(tidyverse)
# Paths
path_to_knn <- here::here("scRNA-seq/2-QC/5-batch_effect_correction/1-doublet_exclusion/tmp/integrated_knn_graph.rds")
path_to_doubl_annot <- here::here("scRNA-seq/2-QC/5-batch_effect_correction/1-doublet_exclusion/tmp/doublet_preliminary_annotations.rds")
path_to_save <- here::here("scRNA-seq/2-QC/5-batch_effect_correction/1-doublet_exclusion/tmp/doublet_final_annotations_and_pDNN.rds")
# Functions
source(here::here("scRNA-seq/bin/utils.R"))
# Thresholds
k <- 75
knn_graph <- readRDS(path_to_knn)
doublet_annot <- readRDS(path_to_doubl_annot)
For a fair comparison, we will only compare doublet annotations for hashed cells:
is_hashed <- doublet_annot$HTO_classification.global != "NA"
doubl_annot_hashing <- doublet_annot$HTO_classification.global[is_hashed]
doubl_annot_scrublet <- doublet_annot$scrublet_predicted_doublet[is_hashed]
tabled_annotations <- table(
hashing = factor(doubl_annot_hashing, c("Singlet", "Doublet")),
scrublet = doubl_annot_scrublet
)
tabled_annotations
## scrublet
## hashing FALSE TRUE
## Singlet 199585 17028
## Doublet 59041 21536
As we can see, the overlap between scrublet and cell hashing is relatively small. This is expected, since these two approaches have complementary limitations (discussed above).
# Order levels HTO classification for later plots
doublet_annot$HTO_classification.global <- factor(
doublet_annot$HTO_classification.global,
levels = c("Singlet", "Doublet", "NA")
)
levels(doublet_annot$HTO_classification.global) <- c(
"Singlet",
"Doublet",
"Not Hashed"
)
# Calculate pDNN
doublets_hashing <- which(doublet_annot$HTO_classification.global == "Doublet")
knn_graph_hashing <- knn_graph[, doublets_hashing]
doublet_annot$pDNN_hashing <- Matrix::rowSums(knn_graph_hashing) / k
# Plot
label <- "pDNN (cell hashing)"
pDNN_hashing_hist <- doublet_annot %>%
plot_histogram_doublets(x = "pDNN_hashing", x_lab = label, bins = 30)
pDNN_hashing_density <- doublet_annot %>%
plot_density_doublets(
x = "pDNN_hashing",
x_lab = label,
color = "HTO_classification.global",
color_lab = "HTO classification"
)
pDNN_hashing_boxplot <- doublet_annot %>%
plot_boxplot_doublets(
x = "HTO_classification.global",
y = "pDNN_hashing",
fill = "HTO_classification.global",
y_lab = label
)
scrublet_label <- "Doublet Score (scrublet)"
pDNN_hashing_scatter <- doublet_annot %>%
plot_scatter_doublets(
x = "scrublet_doublet_scores",
y = "pDNN_hashing",
x_lab = scrublet_label,
y_lab = label
)
# Show plots
pDNN_hashing_hist
pDNN_hashing_density
pDNN_hashing_boxplot
pDNN_hashing_scatter
# Calculate pDNN
doublets_scrublet <- which(doublet_annot$scrublet_predicted_doublet)
knn_graph_scrublet <- knn_graph[, doublets_scrublet]
doublet_annot$pDNN_scrublet <- Matrix::rowSums(knn_graph_scrublet) / k
# Plot
label <- "pDNN (scrublet)"
pDNN_scrublet_hist <- doublet_annot %>%
plot_histogram_doublets(x = "pDNN_scrublet", x_lab = label, bins = 30)
pDNN_scrublet_density <- doublet_annot %>%
plot_density_doublets(
x = "pDNN_scrublet",
x_lab = label,
color = "HTO_classification.global",
color_lab = "HTO classification"
)
pDNN_scrublet_boxplot <- doublet_annot %>%
plot_boxplot_doublets(
x = "HTO_classification.global",
y = "pDNN_scrublet",
fill = "HTO_classification.global",
y_lab = label
)
pDNN_scrublet_scatter <- doublet_annot %>%
plot_scatter_doublets(
x = "scrublet_doublet_scores",
y = "pDNN_scrublet",
x_lab = scrublet_label,
y_lab = label
)
pDNN_scrublet_hashing <- doublet_annot %>%
plot_scatter_doublets(
x = "pDNN_hashing",
y = "pDNN_scrublet",
x_lab = "pDNN (cell hashing)",
y_lab = label
)
# Show plots
pDNN_scrublet_hist
pDNN_scrublet_density
pDNN_scrublet_boxplot
pDNN_scrublet_scatter
pDNN_scrublet_hashing
# Calculate pDNN
all_doublets <-
doublet_annot$HTO_classification.global == "Doublet" |
doublet_annot$scrublet_predicted_doublet
doublets_union <- which(all_doublets)
knn_graph_union <- knn_graph[, doublets_union]
doublet_annot$pDNN_union <- Matrix::rowSums(knn_graph_union) / k
# Plot
label <- "pDNN (cell hashing or scrublet)"
pDNN_union_hist <- doublet_annot %>%
plot_histogram_doublets(x = "pDNN_union", x_lab = label, bins = 30)
pDNN_union_density <- doublet_annot %>%
plot_density_doublets(
x = "pDNN_union",
x_lab = label,
color = "HTO_classification.global",
color_lab = "HTO classification"
)
pDNN_union_boxplot <- doublet_annot %>%
plot_boxplot_doublets(
x = "HTO_classification.global",
y = "pDNN_union",
fill = "HTO_classification.global",
y_lab = label
)
pDNN_union_scatter <- doublet_annot %>%
plot_scatter_doublets(
x = "scrublet_doublet_scores",
y = "pDNN_union",
x_lab = scrublet_label,
y_lab = label
)
# Show plots
pDNN_union_hist
pDNN_union_density
pDNN_union_boxplot
pDNN_union_scatter
Interestingly, considering both annotations yields a better separation of singlets/homotypic doublets and heterotypic doublets.
Initially, we will be very permissive and only exclude doublets annotated with cell hashing. We will keep both the pDNN and scrublet variables in the metadata, which we will leverage downstream to rule out doublets at the cluster level. This approach will help us both remove doublets and understand its impacts on the data.
table(doublet_annot$HTO_classification.global == "Doublet")
##
## FALSE TRUE
## 266685 80577
In total, we will eliminate 80577 doublets.
Finally, we will keep the cells that have and outlier library size, as they could represent a specific cell type.
table(doublet_annot$has_high_lib_size)
##
## FALSE TRUE
## 344761 2501
saveRDS(doublet_annot, path_to_save)
sessionInfo()
## R version 3.6.0 (2019-04-26)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Red Hat Enterprise Linux Server release 6.7 (Santiago)
##
## Matrix products: default
## BLAS: /apps/R/3.6.0/lib64/R/lib/libRblas.so
## LAPACK: /home/devel/rmassoni/anaconda3/lib/libmkl_rt.so
##
## locale:
## [1] LC_CTYPE=C LC_NUMERIC=C LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8 LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8 LC_PAPER=en_US.UTF-8 LC_NAME=C LC_ADDRESS=C LC_TELEPHONE=C LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] forcats_0.5.0 stringr_1.4.0 dplyr_1.0.4 purrr_0.3.4 readr_1.3.1 tidyr_1.1.0 tibble_3.0.1 ggplot2_3.3.0 tidyverse_1.3.0 Matrix_1.2-18 Seurat_3.2.0 BiocStyle_2.14.4
##
## loaded via a namespace (and not attached):
## [1] Rtsne_0.15 colorspace_1.4-1 deldir_0.1-25 ellipsis_0.3.1 ggridges_0.5.2 rprojroot_1.3-2 fs_1.4.1 rstudioapi_0.11 spatstat.data_1.4-3 farver_2.0.3 leiden_0.3.3 listenv_0.8.0 ggrepel_0.8.2 fansi_0.4.1 lubridate_1.7.8 xml2_1.3.2 codetools_0.2-16 splines_3.6.0 knitr_1.28 polyclip_1.10-0 jsonlite_1.7.2 broom_0.5.6 ica_1.0-2 cluster_2.1.0 dbplyr_1.4.4 png_0.1-7 uwot_0.1.8 shiny_1.4.0.2 sctransform_0.2.1 BiocManager_1.30.10 compiler_3.6.0 httr_1.4.2 backports_1.1.7 assertthat_0.2.1 fastmap_1.0.1 lazyeval_0.2.2 cli_2.0.2 later_1.0.0 htmltools_0.4.0 tools_3.6.0 rsvd_1.0.3 igraph_1.2.5 gtable_0.3.0 glue_1.4.1 RANN_2.6.1 reshape2_1.4.4 rappdirs_0.3.1 Rcpp_1.0.6 spatstat_1.64-1 cellranger_1.1.0 vctrs_0.3.6 ape_5.3 nlme_3.1-148 lmtest_0.9-37
## [55] xfun_0.14 globals_0.12.5 rvest_0.3.5 mime_0.9 miniUI_0.1.1.1 lifecycle_0.2.0 irlba_2.3.3 goftest_1.2-2 future_1.17.0 MASS_7.3-51.6 zoo_1.8-8 scales_1.1.1 hms_0.5.3 promises_1.1.0 spatstat.utils_1.17-0 parallel_3.6.0 RColorBrewer_1.1-2 yaml_2.2.1 reticulate_1.16 pbapply_1.4-2 gridExtra_2.3 rpart_4.1-15 stringi_1.4.6 rlang_0.4.10 pkgconfig_2.0.3 evaluate_0.14 lattice_0.20-41 ROCR_1.0-11 tensor_1.5 labeling_0.3 patchwork_1.0.0 htmlwidgets_1.5.1 cowplot_1.0.0 tidyselect_1.1.0 here_0.1 RcppAnnoy_0.0.16 plyr_1.8.6 magrittr_1.5 bookdown_0.19 R6_2.4.1 generics_0.0.2 DBI_1.1.0 withr_2.4.1 pillar_1.4.4 haven_2.3.1 mgcv_1.8-31 fitdistrplus_1.1-1 survival_3.1-12 abind_1.4-5 future.apply_1.5.0 modelr_0.1.8 crayon_1.3.4 KernSmooth_2.23-17 plotly_4.9.2.1
## [109] rmarkdown_2.2 readxl_1.3.1 grid_3.6.0 data.table_1.12.8 blob_1.2.1 reprex_0.3.0 digest_0.6.20 xtable_1.8-4 httpuv_1.5.3.1 munsell_0.5.0 viridisLite_0.3.0